In this document, we will analyse DESeq2 results via a functional enrichment analysis
Set working directory and load libraries
#setwd("~/Desktop/rnaseq/")
setwd("~/Documents/students_MSc/clara/rnaseq/")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gprofiler2)
Let’s load the results files we need.
# the list of DEG results from the previous exercises
res<-readRDS("./results/deseq2_regions_results.rds")
# the annotations
xtrop<-read_csv("./xtr109/diamondblast109.csv")
## Rows: 24392 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): gene_id, transcript_id, peptide_id, xenx_pep_id, xenx_gene_symbol, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ideally, the annotations of your genes should come from experimental evidence from your organism. If you work with mice, Drosophila or humans for example, many functional enrichment analysis tools will be very easy to implement because they will automatically connect your lists of genes to annotations, background lists etc. This is unlikely going to exist if you work with non-model systems. We are therefore dependent on making our own annotations and lists.
For our annotations, we are using BLAST results from querying our genes against the proteome of Xenopus tropicalis. This is a well studied frog species. It is still only distantly related to our focal species, but this is the best we can do. As we saw in the previous exercise, this results in many genes not having any annotations. These will be excluded unfortunately.
pull out DEGs and convert to xenopus IDs
pull_genes<-function(x, alpha=0.05, lfc=0, signed="both", feature="xenp_gene_symbol") {
# filter by adjusted p
x<-x %>%
filter(padj<alpha)
if(signed=="up"){
x<-x %>%
filter(log2FoldChange>lfc) %>%
pull(feature)
}
if(signed=="down"){
x<-x %>%
filter(log2FoldChange<(lfc*-1)) %>%
pull(feature)
}
if(signed=="both"){
x<-x %>%
filter(abs(log2FoldChange)>lfc) %>%
pull(feature)
}
## deal with multiple gene annotations for the same gene (different transcripts)
x<-strsplit(x, ";") %>% unlist() %>% unique()
x<-x[!is.na(x)]
return(x[!is.na(x)])
}
# now extract all
sig_deg<-lapply(res, FUN=function(x)
x %>%
as_tibble(rownames = "gene_id") %>%
left_join(xtrop) %>%
pull_genes(feature = "xenp_pep_id", alpha = 0.05, lfc=0, signed = "both")
)
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
str(sig_deg)
## List of 2
## $ central: chr [1:104] "ENSXETP00000106396" "ENSXETP00000089706" "ENSXETP00000118428" "ENSXETP00000118788" ...
## $ south : chr [1:774] "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000101223" "ENSXETP00000013093" ...
There is some discussion about what makes a good background. Ideally, it should be the complete list of genes that could be differentially expressed. But what is this?
We will use the full set of genes that were returned by
DESeq2. This set should have filtered out genes that have
low counts (i.e. unlikely to be expressed across any of our
tissues/conditions).
We can use the same function from earlier to convert our list of Pelobates IDs to Xenopus peptide IDs.
# function to pull out xtr background IDs
extract_xtr<-function(x) {
return(
xtrop %>%
filter(gene_id %in% x) %>%
pull(xenp_pep_id) %>%
str_split(pattern=";") %>%
unlist() %>%
na.omit() %>%
unique()
)
}
xtr_bg<-lapply(res, FUN= function(x) extract_xtr(rownames(x)))
str(xtr_bg)
## List of 2
## $ central: chr [1:15208] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ south : chr [1:15208] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
# lets unify the background to use the same across all populations
xtr_bg_all<-xtr_bg %>%
unlist() %>%
unique()
We are now ready to go! There are a number of software and R packages that let you perform functional enrichment analysis. Here, we will use [g:Profiler])(https://biit.cs.ut.ee/gprofiler/), because it plays particularly well with R and with Ensembl gene/peptide annotations.
# set base url:
set_base_url("https://biit.cs.ut.ee/gprofiler_archive3/e109_eg56_p17/")
# run the analysis
res_ora<-gost(multi_query = FALSE, # returns separate results tables for multiquery
custom_bg = xtr_bg_all, # our background
query=sig_deg, # our list of gene sets
organism="xtropicalis", # the organism our annotations belong to
exclude_iea = FALSE, # include GO terms that were electronically assigned
correction_method = "gSCS", # the recommended multiple testing correction.
sources=c("GO:BP","GO:CC","GO:MF", "KEGG","REAC"), # the functional sets we are interested in
evcodes=FALSE, ## evcodes TRUE needed for downstream analysis like enrichment maps in Cytoscape, but this takes longer.
significant= FALSE) # return all terms, not just the significant ones
## Detected custom background input, domain scope is set to 'custom'.
# the results are stored as a "results" dataframe
head(res_ora$result)
## query significant p_value term_size query_size intersection_size
## 1 central FALSE 0.05112653 4568 98 56
## 2 central FALSE 0.07075655 4231 98 53
## 3 central FALSE 0.07532821 3871 98 50
## 4 central FALSE 0.87262500 1248 98 23
## 5 central FALSE 1.00000000 120 98 1
## 6 central FALSE 1.00000000 17 98 1
## precision recall term_id source
## 1 0.57142857 0.012259194 GO:0065007 GO:BP
## 2 0.54081633 0.012526589 GO:0050789 GO:BP
## 3 0.51020408 0.012916559 GO:0050794 GO:BP
## 4 0.23469388 0.018429487 GO:0009889 GO:BP
## 5 0.01020408 0.008333333 GO:0042127 GO:BP
## 6 0.01020408 0.058823529 GO:0042113 GO:BP
## term_name effective_domain_size
## 1 biological regulation 13709
## 2 regulation of biological process 13709
## 3 regulation of cellular process 13709
## 4 regulation of biosynthetic process 13709
## 5 regulation of cell population proliferation 13709
## 6 B cell activation 13709
## source_order parents
## 1 16811 GO:0008150
## 2 14004 GO:00081....
## 3 14008 GO:00099....
## 4 3827 GO:00090....
## 5 10550 GO:00082....
## 6 10540 GO:0046649
We can use a p_value cutoff of 0.05 to see how many terms have been functionally enriched in each term.
res_ora$result %>%
filter(p_value<0.05) %>%
group_by(query) %>%
dplyr::count(query, sort=TRUE)
## # A tibble: 2 × 2
## # Groups: query [2]
## query n
## <chr> <int>
## 1 south 12
## 2 central 5
gostplot(res_ora)
custom dot plot using the gprofiler results tables and ggplot.
res_ora$result %>%
select(query,term_name, p_value, intersection_size, query_size,source) %>%
filter(p_value<0.05) %>%
mutate(GeneRatio=intersection_size/query_size) %>%
arrange(GeneRatio) %>%
mutate(term_name = factor(term_name, levels=unique(term_name))) %>%
ggplot(aes(x=GeneRatio, y=term_name)) +
geom_point(aes(color=p_value, size=intersection_size)) +
ylab("") +
#scale_color_scico(palette = "batlow", direction = -1) +
facet_grid(source~query,scales = "free_y",space = "free") +
theme_bw()